function [output] = sum_collate(dir_tmp,idx)
%SUM_COLLATE Summarize grouped property files and write collated CSV tables.
%This function goes through a set of properties files, calculates summary
%statistics for each file, sorts the files into groups (based on same name)
%and outputs each group as a single file.  Should be used in conjunction
%with join_verts.
%Syntax:    [output] = sum_collate(dir_tmp,idx);
%Input:     dir_tmp = the directory of interest.  Leave it out or pass an
%               empty value to be prompted for a directory.  Special case:
%               enter a number in dir_tmp and the function will let you
%               multi-select files instead.
%           idx = the property column used for the outlier filter.  Leave
%               it out or pass [] to have the column guessed from the data.
%               Use sum_collate([],idx) to call with index only.
%Output:    output = the data set.  Structure array (one element per file
%               group) with fields data, qdata and mdata = the raw,
%               quantile-normalized and median-normalized statistics
%               (5 statistics x properties x files; the statistics are
%               mean, std, count, median and mad, in that order).
%               output(1).names holds the group names in order.
%Side effects: creates the folders sum_collated, sum_collated_quantile and
%               sum_collated_median inside dir_tmp and writes CSV files
%               into them through sav2csv.  Warnings are switched off while
%               the function runs and the previous state is restored on
%               exit.

multiselect = 0;    %off by default
filenames = [];
if nargin<2     %keep a caller supplied idx, default only when it is missing
    idx = [];
end
if nargin<1
    dir_tmp = '';
end

%the empty test has to come first: isnumeric([]) is true, so [] would
%otherwise be mistaken for the multi-select request
if isempty(dir_tmp)     %no directory given, ask for one
    prompt_box('title','Open File Location','prompt1','Select the vertex lists you want to combine.','position','center');
    pause(0.25);
    dir_tmp = uigetdir2('','Directory where the files are located');    %get the directory
elseif isnumeric(dir_tmp)   %special case, pick the files by hand
    multiselect = 1;  %on
    [filenames,dir_tmp] = uigetfile2({'*.csv','Text files (*.csv)';'*.*','All Files';},...
        'Open Files for summary','Multiselect','on');
    if ischar(filenames)    %a single selection is a string, make it a cell array
        filenames = {filenames};
    end
end
%NOTE(review): assumes the dialog helpers return 0 on cancel the way
%uigetdir/uigetfile do - confirm against uigetdir2/uigetfile2
if isequal(dir_tmp,0)||isempty(dir_tmp)
    error('sum_collate:noSelection','No directory or files were selected.');
end

%Now sort through the directory and find the file groups
[filenames,names,size_array] = dir_sort(dir_tmp,multiselect,filenames);
if isempty(filenames)
    error('sum_collate:noFiles','No .csv property files were found to summarize.');
end
%get the largest number of files in a group
max_size = max(size_array);
%there are two modes, horizontal collate when every channel has only one
%file, and vertical when some channel has more than one, as in ur_analysis.
%The mode is decided once for the whole run so that the row names always
%line up with the rows of the collated arrays.
collate_mode = max_size>1;

%output folders: raw, quantile normalized, median normalized
out_dirs = {fullfile(dir_tmp,'sum_collated'),...
    fullfile(dir_tmp,'sum_collated_quantile'),...
    fullfile(dir_tmp,'sum_collated_median')};
for d = 1:numel(out_dirs)
    if ~exist(out_dirs{d},'dir')
        mkdir(out_dirs{d});
    end
end

%silence warnings for the run, and hand the previous state back on exit
%(also when an error is thrown)
warn_state = warning('off','all');
restore_warn = onCleanup(@() warning(warn_state)); %#ok<NASGU>

data_col = [];  %initialize arrays for the collated data
qdata_col = [];
mdata_col = [];
col_name = {};
for j = 1:size(filenames,2) %go through the file sets
    group_files = filenames{j};     %char matrix, one file name per row
    n_files = size(group_files,1);
    data = [];  %initialize/reset
    qdata = [];
    mdata = [];
    data_col_tmp = [];
    qdata_col_tmp = [];
    mdata_col_tmp = [];
    for i = 1:n_files      %for each file generate statistics
        %the rows of the char matrix are padded with blanks, strip them
        file_path = [dir_tmp,filesep,deblank(group_files(i,:))];
        try     %make sure the file is not empty
            prop_tmp = single(dlmread(file_path,',',1,0));    %open the file, skip the heading row
        catch
            prop_tmp = [];  %unreadable or data-less file counts as no data
        end
        if isempty(data_col_tmp)    %first time through generate temp for collated data
            if ~isempty(prop_tmp)
                col_num = size(prop_tmp,2);
            elseif ~isempty(data_col)    %no data, but there are priors
                col_num = size(data_col,2);
            else    %no data and no priors, count the headings instead
                col_num = count_header_columns(file_path);
            end
            %always five rows, one per statistic, so that groups without
            %any data still keep the stride of five in the collated arrays
            data_col_tmp = zeros(5,col_num,max_size);
            qdata_col_tmp = zeros(5,col_num,max_size);
            mdata_col_tmp = zeros(5,col_num,max_size);
        end
        if ~isempty(prop_tmp)   %if there is data proceed
            %now remove our NaN
            prop_tmp(isnan(prop_tmp(:,1)),:) = [];
            if isempty(prop_tmp)    %removed it all
                prop_tmp = nan(1,size(data_col_tmp,2));
            end
            if isempty(idx)     %no filter column yet, guess it from the data
                idx = guess_filter_column(prop_tmp);
            end
            %now find the outliers and remove the data
            if ~isempty(idx)
                %criteria: median plus 3 times the standard deviation
                out = median(prop_tmp(:,idx))+std(prop_tmp(:,idx))*3;
                prop_tmp(any(prop_tmp(:,idx)>out,2),:) = [];    %remove the outliers
            end
            %now remove our NaN
            prop_tmp(isnan(prop_tmp(:,1)),:) = [];
            %do some math on the raw data
            %the data matrix is 5 statistics by the number of properties
            %by number of instances of that channel or type
            raw_stats = summarize_columns(prop_tmp);
            data(1:5,:,i) = raw_stats;
            data_col_tmp(1:5,:,i) = raw_stats;
            %normalize
            mprop_tmp = manorm(prop_tmp,'Method','median','LogData',1);
            qprop_tmp = quantilenormloco(prop_tmp);
            %quantile norm
            q_stats = summarize_columns(qprop_tmp);
            qdata(1:5,:,i) = q_stats;
            qdata_col_tmp(1:5,:,i) = q_stats;
            %median norm
            m_stats = summarize_columns(mprop_tmp);
            mdata(1:5,:,i) = m_stats;
            mdata_col_tmp(1:5,:,i) = m_stats;
        end
    end
    %files without data at the end of the group never grew the arrays;
    %pad with zero pages so there is one page per file name
    if ~isempty(data)&&size(data,3)<n_files
        data(:,:,n_files) = 0;
        qdata(:,:,n_files) = 0;
        mdata(:,:,n_files) = 0;
    end
    %now save out the data file, one per property; the property count is
    %taken from the statistics, not from whichever file was read last
    file_list = cellstr(group_files);
    for k = 1:size(data,2)     %step through the properties
        tables = {reshape(data(:,k,:),[5,size(data,3)]),...
            reshape(qdata(:,k,:),[5,size(qdata,3)]),...
            reshape(mdata(:,k,:),[5,size(mdata,3)])};
        write_tables(tables,file_list,{},[names{j},'_col_sum.prop',num2str(k),'.csv'],out_dirs);
    end
    output(j).data = data;
    %norm data
    output(j).qdata = qdata;
    output(j).mdata = mdata;
    %put collated data together
    data_col = vertcat(data_col,data_col_tmp);
    qdata_col = vertcat(qdata_col,qdata_col_tmp);
    mdata_col = vertcat(mdata_col,mdata_col_tmp);
    if collate_mode     %one observation name per statistic row of this group
        col_name = vertcat(col_name,{[names{j},'.ave'];[names{j},'.std'];[names{j},'.size'];[names{j},'.med'];[names{j},'.mad']});
    end
end
output(1).names = names;

%create the names for the collated data
if collate_mode
    max_loc = find(size_array==max_size);    %locations of where you have all of the datasets present
    ref_files = filenames{max_loc(1)};   %any of the max sets is fine, we'll just take the first
    strmask = isstrprop(ref_files,'punct');  %find the punctuations
    dataset_name = cell(max_size,1);
    for m = 1:max_size
        y = find(strmask(m,:));  %get the positions
        dataset_name{m,1} = ref_files(m,1:y(1)-1);    %grab the first punctuation delimited characters, which should be unique
    end
else
    dataset_name = reshape(names,1,[]);     %one column per group
    col_name = {'mean' 'std' 'count' 'median' 'mad'};
end

%save out the collated data
stat_files = {'ave','std','count','median','mad'};
for l = 1:size(data_col,2)     %step through the properties
    prop_file = ['col_sum.prop',num2str(l),'.csv'];
    if collate_mode
        caches = {reshape(data_col(:,l,:),[size(data_col,1),size(data_col,3)]),...
            reshape(qdata_col(:,l,:),[size(qdata_col,1),size(qdata_col,3)]),...
            reshape(mdata_col(:,l,:),[size(mdata_col,1),size(mdata_col,3)])};
        %save versions that are easier to collate manually, one file per
        %statistic (every fifth row of the full table)
        for s = 1:5
            rows = s:5:size(data_col,1);
            write_tables({caches{1}(rows,:),caches{2}(rows,:),caches{3}(rows,:)},...
                dataset_name,col_name(rows),[stat_files{s},'_',prop_file],out_dirs);
        end
    else
        caches = {reshape(data_col(:,l),[5,size(data_col,1)/5]),...
            reshape(qdata_col(:,l),[5,size(qdata_col,1)/5]),...
            reshape(mdata_col(:,l),[5,size(mdata_col,1)/5])};
    end
    %the full table with all of the statistics
    write_tables(caches,dataset_name,col_name,prop_file,out_dirs);
end

%-----------------------------------------------------------------------------------------------------------------------------
function col_num = count_header_columns(file_path)
%Count the columns of a comma delimited file from its first (heading) line.
fid = fopen(file_path);
if fid<0
    error('sum_collate:fileOpen','Could not open %s.',file_path);
end
header = fgetl(fid);    %the whole line, headings may contain spaces
fclose(fid);
if ~ischar(header)      %fgetl returns -1 at end of file
    error('sum_collate:emptyFile','%s has no heading line.',file_path);
end
col_num = sum(header==',')+1;

%-----------------------------------------------------------------------------------------------------------------------------
function idx = guess_filter_column(prop)
%Guess the filter column: its column sum is a whole number and not zero,
%and its median is below 100.  Returns [] when no column qualifies and
%falls back to column 2 when more than one does.
col_sum = sum(prop,1);   %explicit dimension so a single row is not summed across
score = (col_sum==round(col_sum))+(median(prop,1)<100)+(col_sum~=0);
idx = find(score==3);     %meets all criteria
if size(idx,2)>1    %the wild case where more than one made it through
    idx = 2;
end

%-----------------------------------------------------------------------------------------------------------------------------
function stats = summarize_columns(x)
%Column statistics stacked as rows: mean, std, count, median, mad.
%std is normalized by N (weight flag 1) and taken along the rows; the
%dimension is given explicitly so a single observation is handled per
%column as well.
stats = [mean(x,1);std(x,1,1);repmat(size(x,1),1,size(x,2));median(x,1);mad(x,1,1)];

%-----------------------------------------------------------------------------------------------------------------------------
function write_tables(tables,var_names,obs_names,file_name,out_dirs)
%Wrap each matrix in tables into a dataset and save it with sav2csv under
%the same file name in the matching folder of out_dirs.
%var_names = names for the columns, obs_names = names for the rows ({} for
%none).
for t = 1:numel(tables)
    if isempty(obs_names)
        ds = dataset({tables{t},var_names{:}});
    else
        ds = dataset({tables{t},var_names{:}},'obsname',obs_names);
    end
    sav2csv(ds,file_name,out_dirs{t});
end

%-----------------------------------------------------------------------------------------------------------------------------
function [filenames_out,uq_names,size_array] = dir_sort(dir_tmp,multiselect,filenames)
%DIR_SORT Group .csv files by the name embedded in their file names.
%Input:     dir_tmp = directory to list; only read when multiselect is off
%           multiselect = true when the caller already has the file list
%           filenames = cell array of file names (a single name may be
%               given as a string); only used when multiselect is on
%Output:    filenames_out = cell array with, for each group, a char matrix
%               of the file names in it (one file per row, padded by char)
%           uq_names = cell array with the group names, in the order
%               returned by unique; names that differ only in case are
%               merged and the first one is kept
%           size_array = number of files in each group
%Files whose names do not end in '.csv' (case sensitive) are ignored.  When
%there is no .csv file all three outputs come back empty.
filenames_out = {};
uq_names = {};
size_array = [];

if ~multiselect
    dir_struct = dir(dir_tmp);  %grab the directory information
    filenames = {dir_struct(~[dir_struct.isdir]).name};   %files only, no folders
elseif ischar(filenames)
    filenames = cellstr(filenames);
end
if isempty(filenames)
    return
end
filenames = reshape(filenames,1,[]);    %work on a row

%we only want csv files; drop everything else up front so that the name
%list below has no holes in it
is_csv = false(1,numel(filenames));
for i = 1:numel(filenames)
    fn = filenames{i};
    is_csv(i) = ischar(fn)&&numel(fn)>=4&&strcmp(fn(1,end-3:end),'.csv');
end
filenames = filenames(is_csv);
if isempty(filenames)
    return
end

%pull the wanted word out of each file name
f_tmp = cell(1,numel(filenames));
for i = 1:numel(filenames)
    f_tmp{i} = csv_group_name(filenames{i});
end

uq_names = reshape(unique(f_tmp),1,[]);    %how many unique words are there
%make sure there are no duplicates with different cases, keep the first
drop = false(1,numel(uq_names));
for l = 1:numel(uq_names)
    for m = l+1:numel(uq_names)
        if strcmpi(uq_names{l},uq_names{m})
            drop(m) = true;
        end
    end
end
uq_names(drop) = [];

%collect the files of each group
filenames_out = cell(1,numel(uq_names));
size_array = zeros(1,numel(uq_names));
for j = 1:numel(uq_names)
    members = strcmpi(uq_names{j},f_tmp);   %case insensitive match
    filenames_out{j} = char(filenames(members));
    size_array(j) = size(filenames_out{j},1);     %get the size of each file group
end

%-----------------------------------------------------------------------------------------------------------------------------
function name = csv_group_name(fn)
%Pull the grouping name out of one .csv file name.  The name is split at
%punctuation (the & character is exempt) and whitespace; the extension,
%numeric words at the back and a numeric word at the front are left out.
%NOTE(review): str2num evaluates its input, so a word in a file name that
%happens to be a command or function name gets executed.  It is kept
%because the grouping of existing file sets depends on what it accepts;
%consider a non-evaluating check if the file names are not trusted.
delim = (isstrprop(fn,'punct')&fn~='&')|isstrprop(fn,'wspace');
y = find(delim);  %delimiter positions, the last one is the dot of .csv

%walk in from the back while the word is a number
b = 0;
if numel(y)>=2  %with a single delimiter there is nothing in between
    a = 1;
    tmp = fn(1,y(end-a)+1:y(end-b)-1);
    while ~isempty(str2num(tmp)) %#ok<ST2NM>
        a = a+1;
        b = b+1;
        if numel(y)-a==0    %ran out of delimiters
            break
        end
        tmp = fn(1,y(end-a)+1:y(end-b)-1);
    end
end

%now do the same for the front
%NOTE(review): from the second pass on the candidate starts at character
%a rather than after the previous delimiter; left as is because existing
%groupings depend on it
a = 0;
tmp = fn(1,1:y(1+a)-1);
while ~isempty(str2num(tmp))&&numel(y)~=a+1 %#ok<ST2NM>
    a = a+1;
    tmp = fn(1,a:y(1+a)-1);
end
if a==0
    name = fn(1,1:y(end-b)-1);
else
    name = fn(1,y(a)+1:y(end-b)-1);
end
%-----------------------------------------------------------------------------------------------------------------------------
function [norm_data] = quantilenormloco(data)
%Wrapper that keeps quantilenorm away from columns it cannot work with.
%Columns holding at most one non-NaN value are passed through untouched;
%the remaining columns are quantile normalized together (with the
%'Median' option on) and written back into their original positions.

%number of usable (non-NaN) entries in every column
valid_count = abs(sum(isnan(data),1)-size(data,1));
sparse_col = valid_count<=1;    %empty or singleton columns

if ~any(sparse_col)     %nothing to protect, normalize the lot
    norm_data = quantilenorm(data,'Median',1);
    return
end

%start from a copy so the sparse columns keep their original content
norm_data = data;
usable = data(:,~sparse_col);
if ~isempty(usable)     %if all is gone, there is nothing to normalize
    norm_data(:,~sparse_col) = quantilenorm(usable,'Median',1);
end